home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Amiga Format CD 39
/
Amiga Format CD39 (1999-04-13)(Future Publishing)(GB)[!][issue 1999-05].iso
/
-seriously_amiga-
/
graphics
/
ripley
/
source
/
idct.asm
< prev
next >
Wrap
Assembly Source File
|
1999-03-02
|
12KB
|
735 lines
;********************************************************************
;
; idct.asm
;
; 1.0 - 28.7.98 (bifat) written in assembler
; 1.1 - 29.7.98 (bifat) optimized for superscalarity,
; inlined idctrow/idctcol into
; the FastIDCT main loop
; 1.2 - 30.7.98 (bifat) idctrow and idctcol heavily
; optimized for superscalar execution
; 1.3 - 2.8.98 (bifat) removed clipping. damn, i see no
; fucking difference!
; 1.4 - 1.10.98 (bifat) idctcol shortcut bug fixed
; idctrow shortcut was not detected - fixed
;
;********************************************************************
; Fixed-point multipliers for the IDCT butterflies. The MPEG reference
; idct.c defines the same values as 2048*sqrt(2)*cos(k*pi/16).
W1 EQU 2841 ; k = 1
W2 EQU 2676 ; k = 2
W3 EQU 2408 ; k = 3
W5 EQU 1609 ; k = 5
W6 EQU 1108 ; k = 6
W7 EQU 565 ; k = 7
;********************************************************************
section text
XDEF _Initialize_Fast_IDCT
XDEF _Fast_IDCT
;********************************************************************
; Lookup table storage: two back-to-back runs of 512 words, i.e. 1024
; words starting at iclip. The label iclp marks the midpoint, so code
; that loads iclp as a base can use signed word indices -512..511.
; Filled by _Initialize_Fast_IDCT; both labels are reached through
; PC-relative lea in the routines below.
iclip ds.w 512
iclp ds.w 512
;********************************************************************
;--------------------------------------------------------------------
; _Initialize_Fast_IDCT
;
; Fills the 1024-word table at iclip so that, read through the
; midpoint label iclp with a signed index i in -512..511,
;       iclp[i] = -256  for i < -256
;       iclp[i] =  i    for -256 <= i <= 255
;       iclp[i] =  255  for i > 255
; which is what Initialize_Fast_IDCT() in the C reference produces.
;
; In:    nothing
; Out:   nothing
; Clobb: none (a0/d0/d1 are saved and restored)
;--------------------------------------------------------------------
_Initialize_Fast_IDCT:
        movem.l a0/d0/d1,-(a7)
        lea     iclip(pc),a0            ; a0 walks the table front to back

        ; entries for i = -512..-257 : 256 copies of the lower bound
        move.w  #-256,d1
        move.w  #256-1,d0
.inilop move.w  d1,(a0)+
        dbf     d0,.inilop

        ; entries for i = -256..255 : identity ramp.
        ; The ramp has to start at -256 (not -255); otherwise every
        ; entry is one too large and the last one (256) exceeds the
        ; upper clipping bound of 255.
        move.w  #-256,d1
        move.w  #512-1,d0
.inilop2 move.w d1,(a0)+
        addq.w  #1,d1
        dbf     d0,.inilop2

        ; entries for i = 256..511 : 256 copies of the upper bound
        move.w  #255,d1
        move.w  #256-1,d0
.inilop3 move.w d1,(a0)+
        dbf     d0,.inilop3

        movem.l (a7)+,a0/d0/d1
        rts
;********************************************************************
;--------------------------------------------------------------------
; idctrow - one 8-point row IDCT, performed in place
;
; In:    d0 = address of the row (8 consecutive words, blk[0..7])
; Out:   blk[0..7] overwritten with the transformed row
; Clobb: d0-d7/a0-a2/a5/a6, flags
; CPU:   uses muls.l, so a 68020 or better is required
;
; Mirrors idctrow() of the MPEG reference idct.c. The names x0..x8 in
; the comments are the C variables. Loads, the zero test and the
; butterflies are interleaved on purpose (superscalar scheduling), so
; the instruction order must not be changed casually.
;
; Register use after the loads:
;   d1 = blk[1] (x4)   d2 = blk[2] (x3)   d3 = blk[3] (x7)
;   d4 = blk[4]<<11 (x1)   d5 = blk[5] (x6)   d6 = blk[6] (x2)
;   a5 = blk[7] (x5)
;--------------------------------------------------------------------
idctrow: MACRO
        move.l  d0,a6                   ; a6 = blk
        lea     2(a6),a5                ; a5 -> blk[1]
        move.w  (a5)+,d1                ; d1 = blk[1]
        moveq   #11,d7                  ; shift count for blk[4]
        move.w  (a5)+,d2                ; d2 = blk[2]
        move.w  d1,d0                   ; d0.w collects the OR of blk[1..7]
        move.w  (a5)+,d3                ; d3 = blk[3]
        or.w    d2,d0
        move.w  (a5)+,d4                ; d4 = blk[4]
        or.w    d3,d0
        move.w  (a5)+,d5                ; d5 = blk[5]
        or.w    d4,d0
        move.w  (a5)+,d6                ; d6 = blk[6]
        ext.l   d4
        or.w    d5,d0
        asl.l   d7,d4                   ; x1 = blk[4]<<11
        move.w  (a5)+,d7                ; d7 = blk[7]
        or.w    d6,d0
        move.w  d7,a5                   ; a5 = blk[7], sign-extended to long
        or.w    d7,d0
        bne.b   .cont1\@                ; some of blk[1..7] non-zero: full transform

        ; shortcut: only blk[0] is non-zero, every output is blk[0]<<3
        move.w  (a6),d0
        asl.w   #3,d0
        move.w  d0,(a6)+
        move.w  d0,(a6)+
        move.w  d0,(a6)+
        move.w  d0,(a6)+
        move.w  d0,(a6)+
        move.w  d0,(a6)+
        move.w  d0,(a6)+
        move.w  d0,(a6)+
        bra     .ok\@
        cnop    0,4

.cont1\@ moveq  #11,d7
        move.w  (a6),d0
        ext.l   d0
        asl.l   d7,d0                   ; blk[0]<<11
        moveq   #-128,d7                ; rounding bias for the final >>8 is +128
        sub.l   d7,d0                   ; (was -127, which added only 127)
        move.l  d0,a0                   ; a0 = x0 = (blk[0]<<11) + 128

        ; first stage
        move.w  d1,d0
        add.w   a5,d0                   ; d0.w = x4 + x5
        muls.w  #W1-W7,d1               ; d1 = (W1-W7)*x4
        exg     a5,d7                   ; d7 = x5, a5 parks the old d7
        muls.w  #W7,d0                  ; d0 = x8 = W7*(x4+x5)
        muls.w  #-(W1+W7),d7            ; d7 = -(W1+W7)*x5
        exg     a5,d7                   ; a5 = -(W1+W7)*x5
        add.l   d0,d1                   ; d1 = x4 = x8 + (W1-W7)*x4
        add.l   d0,a5                   ; a5 = x5 = x8 - (W1+W7)*x5
        move.l  d5,d0
        add.l   d3,d0                   ; d0.w = x6 + x7
        muls.w  #W3,d0                  ; d0 = x8 = W3*(x6+x7)
        muls.w  #-(W3-W5),d5
        add.l   d0,d5                   ; d5 = x6 = x8 - (W3-W5)*x6
        muls.w  #-(W3+W5),d3
        add.l   d0,d3                   ; d3 = x7 = x8 - (W3+W5)*x7

        ; second stage
        move.l  a0,d0
        add.l   d4,d0
        move.l  d0,a1                   ; a1 = x8 = x0 + x1
        sub.l   d4,a0                   ; a0 = x0 = x0 - x1
        move.l  d2,d4
        add.l   d6,d4                   ; d4.w = x3 + x2
        muls.w  #W6,d4                  ; d4 = x1 = W6*(x3+x2)
        muls.w  #-(W2+W6),d6
        add.l   d4,d6                   ; d6 = x2 = x1 - (W2+W6)*x2
        muls.w  #(W2-W6),d2
        add.l   d4,d2                   ; d2 = x3 = x1 + (W2-W6)*x3
        move.l  d1,d4
        add.l   d5,d4                   ; d4 = x1 = x4 + x6
        sub.l   d5,d1                   ; d1 = x4 = x4 - x6
        move.l  a5,d5
        add.l   d3,d5                   ; d5 = x6 = x5 + x7
        sub.l   d3,a5                   ; a5 = x5 = x5 - x7

        ; third stage
        move.l  a1,d3
        add.l   d2,d3                   ; d3 = x7 = x8 + x3
        sub.l   d2,a1                   ; a1 = x8 = x8 - x3
        move.l  a0,d2
        add.l   d6,d2                   ; d2 = x3 = x0 + x2
        sub.l   d6,a0                   ; a0 = x0 = x0 - x2
        move.w  #-128,a2                ; a2 = -128 (subtracting it adds 128)
        move.l  #181,d7
        move.l  d1,d6
        add.l   a5,d6                   ; d6 = x4 + x5
        muls.l  d7,d6
        sub.l   a5,d1                   ; d1 = x4 - x5
        sub.l   a2,d6
        muls.l  d7,d1
        asr.l   #8,d6                   ; d6 = x2 = (181*(x4+x5)+128)>>8
        sub.l   a2,d1

        ; fourth stage, interleaved with the tail of the third
        move.l  d3,d0
        asr.l   #8,d1                   ; d1 = x4 = (181*(x4-x5)+128)>>8
        move.l  d2,d7
        add.l   d4,d0
        add.l   d6,d7
        asr.l   #8,d0
        asr.l   #8,d7
        move.w  d0,(a6)+                ; blk[0] = (x7+x1)>>8
        move.l  a0,d0
        move.w  d7,(a6)+                ; blk[1] = (x3+x2)>>8
        add.l   d1,d0
        move.l  a1,d7
        asr.l   #8,d0
        add.l   d5,d7
        move.w  d0,(a6)+                ; blk[2] = (x0+x4)>>8
        asr.l   #8,d7
        move.l  a1,d0
        move.w  d7,(a6)+                ; blk[3] = (x8+x6)>>8
        sub.l   d6,d2
        sub.l   d5,d0
        asr.l   #8,d2                   ; d2 = (x3-x2)>>8, stored below as blk[6]
        asr.l   #8,d0
        move.w  d0,(a6)+                ; blk[4] = (x8-x6)>>8
        move.l  a0,d0
        sub.l   d4,d3
        sub.l   d1,d0
        asr.l   #8,d3                   ; d3 = (x7-x1)>>8, stored below as blk[7]
        asr.l   #8,d0                   ; d0 = (x0-x4)>>8, stored below as blk[5]
        movem.w d0/d2/d3,(a6)           ; blk[5], blk[6], blk[7] in one go
.ok\@
        ENDM
;********************************************************************
; idctcol - one 8-point column IDCT, performed in place.
;
; In:    d0 = address of the top element of the column; the elements
;        are read and written at byte offsets 8*k*2 (k = 0..7)
; Out:   the eight column elements are overwritten
; Clobb: d0-d7/a0-a2/a5/a6, flags
;
; Mirrors idctcol() of the MPEG reference idct.c; x0..x8 in the
; comments are the C variables. Unlike the C code, the full path stores
; its results directly (shifted right by 14) without going through the
; iclp table; only the DC-only shortcut uses iclp.
; The instruction order is hand-scheduled; keep it when editing.
idctcol: MACRO
move.l d0,a6 ; a6 = blk
move.w 8*4*2(a6),d1 ; d1 = blk[8*4], becomes x1 after the shift below
lea iclp(pc),a2 ; a2 = table midpoint, used only by the shortcut
move.w 8*6*2(a6),d2 ; x2 = blk[8*6]
move.l d1,d0 ; d0.w starts the OR of all AC terms (only the low word is tested)
move.w 8*2*2(a6),d3 ; x3 = blk[8*2]
or.w d2,d0
move.w 8*1*2(a6),d4 ; x4 = blk[8*1]
or.w d3,d0
move.w 8*7*2(a6),d5 ; x5 = blk[8*7]
or.w d4,d0
move.w 8*5*2(a6),d6 ; x6 = blk[8*5]
or.w d5,d0
move.w 8*3*2(a6),d7 ; x7 = blk[8*3]
or.w d6,d0
move.w d7,a5 ; a5 = x7, sign-extended to long
or.w d7,d0
bne.b .cont1\@ ; some AC term non-zero: full transform
; shortcut: every output is the table entry at index (blk[8*0]+32)>>6
moveq #32,d0
add.w 8*0*2(a6),d0
asr.w #6,d0
move.w (a2,d0.w*2),d0 ; scaled index: word entry d0 relative to iclp
move.w d0,8*0*2(a6)
move.w d0,8*1*2(a6)
move.w d0,8*2*2(a6)
move.w d0,8*3*2(a6)
move.w d0,8*4*2(a6)
move.w d0,8*5*2(a6)
move.w d0,8*6*2(a6)
move.w d0,8*7*2(a6)
bra .ok\@
cnop 0,4
.cont1\@
ext.l d1
asl.l #8,d1 ; x1 = blk[8*4]<<8
move.w 8*0*2(a6),d7 ; d7 = blk[8*0]
move.l d4,d0
ext.l d7
add.l d5,d0 ; d0.w = x4 + x5
asl.l #8,d7
muls.w #W7,d0
add.l #8192,d7 ; x0 = (blk[8*0]<<8) + 8192
addq.l #4,d0 ; x8 = W7*(x4+x5) + 4
move.l d7,a0 ; a0 = x0
muls.w #W1-W7,d4
muls.w #-(W1+W7),d5
add.l d0,d4
add.l d0,d5
asr.l #3,d4 ; x4 = (x8+(W1-W7)*x4)>>3
move.l a5,d0
add.l d6,d0 ; d0.w = x7 + x6
muls.w #W3,d0
asr.l #3,d5 ; x5 = (x8-(W1+W7)*x5)>>3
addq.l #4,d0 ; x8 = W3*(x6+x7) + 4
muls.w #-(W3-W5),d6
exg a5,d7 ; d7 = x7 for the multiply, a5 parks x0
add.l d0,d6
muls.w #-(W3+W5),d7
asr.l #3,d6 ; x6 = (x8-(W3-W5)*x6)>>3
add.l d0,d7
asr.l #3,d7 ; x7 = (x8-(W3+W5)*x7)>>3
move.l a0,d0
exg a5,d7 ; a5 = x7, d7 = x0 again
add.l d1,d0 ; d0 = x8 = x0 + x1
sub.l d1,a0 ; a0 = x0 = x0 - x1
move.l d2,d1
add.l d3,d1 ; d1.w = x2 + x3
muls.w #W6,d1
muls.w #-(W2+W6),d2
addq.l #4,d1 ; x1 = W6*(x3+x2) + 4
add.l d1,d2
muls.w #W2-W6,d3
asr.l #3,d2 ; x2 = (x1-(W2+W6)*x2)>>3
add.l d1,d3
asr.l #3,d3 ; x3 = (x1+(W2-W6)*x3)>>3
move.l d6,d1
add.l d4,d1 ; d1 = x1 = x4 + x6
sub.l d6,d4 ; d4 = x4 = x4 - x6
move.l a5,d6
add.l d5,d6 ; d6 = x6 = x5 + x7
sub.l a5,d5 ; d5 = x5 = x5 - x7
move.l d0,a5
add.l d3,a5 ; a5 = x7 = x8 + x3
sub.l d3,d0 ; d0 = x8 = x8 - x3
move.l d2,d3
add.l a0,d3 ; d3 = x3 = x0 + x2
sub.l d2,a0 ; a0 = x0 = x0 - x2
move.l d0,a1 ; a1 = x8
move.l d4,d2
moveq #-128,d7 ; subtracting -128 adds the rounding bias 128
add.l d5,d2 ; d2 = x4 + x5
move.l #181,d0
sub.l d5,d4 ; d4 = x4 - x5
muls.l d0,d2
muls.l d0,d4
sub.l d7,d2
sub.l d7,d4
asr.l #8,d2 ; x2 = (181*(x4+x5)+128)>>8
move.l a5,d7
asr.l #8,d4 ; x4 = (181*(x4-x5)+128)>>8
add.l d1,d7
moveq #14,d5 ; d5 = final shift count for every output
move.l a5,d0
asr.l d5,d7
sub.l d1,d0
move.w d7,8*0*2(a6) ; blk[8*0] = (x7+x1)>>14
asr.l d5,d0
move.l d2,d7
move.w d0,8*7*2(a6) ; blk[8*7] = (x7-x1)>>14
add.l d3,d7
move.l a0,d0
asr.l d5,d7
add.l d4,d0
move.w d7,8*1*2(a6) ; blk[8*1] = (x3+x2)>>14
asr.l d5,d0
move.l a1,d7
move.w d0,8*2*2(a6) ; blk[8*2] = (x0+x4)>>14
add.l d6,d7
move.l a1,d0
asr.l d5,d7
sub.l d6,d0
move.w d7,8*3*2(a6) ; blk[8*3] = (x8+x6)>>14
asr.l d5,d0
move.l a0,d7
move.w d0,8*4*2(a6) ; blk[8*4] = (x8-x6)>>14
sub.l d4,d7
sub.l d2,d3
asr.l d5,d7
move.w d7,8*5*2(a6) ; blk[8*5] = (x0-x4)>>14
asr.l d5,d3
move.w d3,8*6*2(a6) ; blk[8*6] = (x3-x2)>>14
.ok\@
ENDM
;********************************************************************
;--------------------------------------------------------------------
; _Fast_IDCT - in-place two-dimensional inverse DCT of one 8x8 block
;
; In:    d0 = address of the block (64 words; rows are 16 bytes
;             apart, columns 2 bytes apart)
; Out:   block overwritten with the transformed values
; Clobb: none (d0-d7/a0-a6 are saved and restored)
;
; a3 and a4 are the only registers the idctrow/idctcol macros leave
; alone, so they carry the loop state: a4 = block base, a3 = address
; of the row/column currently being processed.
;--------------------------------------------------------------------
_Fast_IDCT:
        movem.l d0-d7/a0-a6,-(a7)
        move.l  d0,a4                   ; a4 = block base, kept for both passes

        ; pass 1: eight rows, 8 words (16 bytes) apart
        move.l  d0,a3                   ; a3 = first row
.rowlop
        move.l  a3,d0                   ; macro argument: row address
        idctrow
        lea     8*2(a3),a3              ; advance to the next row
        move.l  a3,d0
        sub.l   a4,d0                   ; d0 = byte offset from the block base
        cmp.w   #8*8*2,d0               ; all eight rows done?
        bne     .rowlop

        ; pass 2: eight columns, 1 word (2 bytes) apart
        move.l  a4,a3                   ; a3 = first column
.collop
        move.l  a3,d0                   ; macro argument: column address
        idctcol
        addq.l  #2,a3                   ; advance to the next column
        move.l  a3,d0
        sub.l   a4,d0                   ; d0 = byte offset from the block base
        cmp.w   #8*2,d0                 ; all eight columns done?
        bne     .collop
skip
        movem.l (a7)+,d0-d7/a0-a6
        rts
END
/* idct.c, inverse fast discrete cosine transform */
/* Copyright (C) 1996, MPEG Software Simulation Group. All Rights Reserved. */
/*
* Disclaimer of Warranty
*
* These software programs are available to the user without any license fee or
* royalty on an "as is" basis. The MPEG Software Simulation Group disclaims
* any and all warranties, whether express, implied, or statuary, including any
* implied warranties or merchantability or of fitness for a particular
* purpose. In no event shall the copyright-holder be liable for any
* incidental, punitive, or consequential damages of any kind whatsoever
* arising from the use of these programs.
*
* This disclaimer of warranty extends to the user of these programs and user's
* customers, employees, agents, transferees, successors, and assigns.
*
* The MPEG Software Simulation Group does not represent or warrant that the
* programs furnished hereunder are free of infringement of any third-party
* patents.
*
* Commercial implementations of MPEG-1 and MPEG-2 video, including shareware,
* are subject to royalty fees to patent holders. Many of these patents are
* general enough such that they are unavoidable regardless of implementation
* design.
*
*/
/**********************************************************/
/* inverse two dimensional DCT, Chen-Wang algorithm */
/* (cf. IEEE ASSP-32, pp. 803-816, Aug. 1984) */
/* 32-bit integer arithmetic (8 bit coefficients) */
/* 11 mults, 29 adds per DCT */
/* sE, 18.8.91 */
/**********************************************************/
/* coefficients extended to 12 bit for IEEE1180-1990 */
/* compliance sE, 2.1.94 */
/**********************************************************/
/* this code assumes >> to be a two's-complement arithmetic */
/* right shift: (-2)>>1 == -1 , (-3)>>1 == -2 */
#include "config.h"
#define W1 2841 /* 2048*sqrt(2)*cos(1*pi/16) */
#define W2 2676 /* 2048*sqrt(2)*cos(2*pi/16) */
#define W3 2408 /* 2048*sqrt(2)*cos(3*pi/16) */
#define W5 1609 /* 2048*sqrt(2)*cos(5*pi/16) */
#define W6 1108 /* 2048*sqrt(2)*cos(6*pi/16) */
#define W7 565 /* 2048*sqrt(2)*cos(7*pi/16) */
/* global declarations */
void Initialize_Fast_IDCT _ANSI_ARGS_((void));
void Fast_IDCT _ANSI_ARGS_((short *block));
/* private data */
static short iclip[1024]; /* clipping table */
/* iclp is set by Initialize_Fast_IDCT() to iclip+512, so that it can be
 * indexed with signed values in the range -512..511 */
static short *iclp;
/* private prototypes */
static void idctrow _ANSI_ARGS_((short *blk));
static void idctcol _ANSI_ARGS_((short *blk));
/* row (horizontal) IDCT
 *
 *           7                       pi         1
 * dst[k] = sum c[l] * src[l] * cos( -- * ( k + - ) * l )
 *          l=0                      8          2
 *
 * where: c[0]    = 128
 *        c[1..7] = 128*sqrt(2)
 *
 * Transforms the eight coefficients blk[0..7] in place.
 *
 * Scaling by a power of two is written as a multiplication instead of
 * a left shift: the coefficients may be negative, and left-shifting a
 * negative value is undefined behaviour in ISO C. The results are
 * identical wherever the shift was defined.
 */
static void idctrow(blk)
short *blk;
{
  int x0, x1, x2, x3, x4, x5, x6, x7, x8;

  x1 = blk[4] * 2048;                   /* blk[4] * 2^11 */
  x2 = blk[6];
  x3 = blk[2];
  x4 = blk[1];
  x5 = blk[7];
  x6 = blk[5];
  x7 = blk[3];

  /* shortcut: only the DC coefficient is non-zero */
  if (!(x1 | x2 | x3 | x4 | x5 | x6 | x7))
  {
    short dc = (short)(blk[0] * 8);     /* blk[0] * 2^3 */

    blk[0]=blk[1]=blk[2]=blk[3]=blk[4]=blk[5]=blk[6]=blk[7]=dc;
    return;
  }

  x0 = blk[0] * 2048 + 128; /* +128 for proper rounding in the fourth stage */

  /* first stage */
  x8 = W7*(x4+x5);
  x4 = x8 + (W1-W7)*x4;
  x5 = x8 - (W1+W7)*x5;
  x8 = W3*(x6+x7);
  x6 = x8 - (W3-W5)*x6;
  x7 = x8 - (W3+W5)*x7;

  /* second stage */
  x8 = x0 + x1;
  x0 -= x1;
  x1 = W6*(x3+x2);
  x2 = x1 - (W2+W6)*x2;
  x3 = x1 + (W2-W6)*x3;
  x1 = x4 + x6;
  x4 -= x6;
  x6 = x5 + x7;
  x5 -= x7;

  /* third stage */
  x7 = x8 + x3;
  x8 -= x3;
  x3 = x0 + x2;
  x0 -= x2;
  x2 = (181*(x4+x5)+128)>>8;
  x4 = (181*(x4-x5)+128)>>8;

  /* fourth stage */
  blk[0] = (x7+x1)>>8;
  blk[1] = (x3+x2)>>8;
  blk[2] = (x0+x4)>>8;
  blk[3] = (x8+x6)>>8;
  blk[4] = (x8-x6)>>8;
  blk[5] = (x0-x4)>>8;
  blk[6] = (x3-x2)>>8;
  blk[7] = (x7-x1)>>8;
}
/* column (vertical) IDCT
 *
 *             7                         pi         1
 * dst[8*k] = sum c[l] * src[8*l] * cos( -- * ( k + - ) * l )
 *            l=0                        8          2
 *
 * where: c[0]    = 1/1024
 *        c[1..7] = (1/1024)*sqrt(2)
 *
 * Transforms the eight coefficients blk[8*0..8*7] (one column of an
 * 8x8 block) in place and clips each result through the iclp table.
 *
 * Scaling by a power of two is written as a multiplication instead of
 * a left shift: the coefficients may be negative, and left-shifting a
 * negative value is undefined behaviour in ISO C. The results are
 * identical wherever the shift was defined.
 */
static void idctcol(blk)
short *blk;
{
  int x0, x1, x2, x3, x4, x5, x6, x7, x8;

  x1 = blk[8*4] * 256;                  /* blk[8*4] * 2^8 */
  x2 = blk[8*6];
  x3 = blk[8*2];
  x4 = blk[8*1];
  x5 = blk[8*7];
  x6 = blk[8*5];
  x7 = blk[8*3];

  /* shortcut: only the DC coefficient is non-zero */
  if (!(x1 | x2 | x3 | x4 | x5 | x6 | x7))
  {
    short dc = iclp[(blk[8*0]+32)>>6];

    blk[8*0]=blk[8*1]=blk[8*2]=blk[8*3]=blk[8*4]=blk[8*5]=blk[8*6]=blk[8*7]=dc;
    return;
  }

  x0 = blk[8*0] * 256 + 8192;

  /* first stage */
  x8 = W7*(x4+x5) + 4;
  x4 = (x8+(W1-W7)*x4)>>3;
  x5 = (x8-(W1+W7)*x5)>>3;
  x8 = W3*(x6+x7) + 4;
  x6 = (x8-(W3-W5)*x6)>>3;
  x7 = (x8-(W3+W5)*x7)>>3;

  /* second stage */
  x8 = x0 + x1;
  x0 -= x1;
  x1 = W6*(x3+x2) + 4;
  x2 = (x1-(W2+W6)*x2)>>3;
  x3 = (x1+(W2-W6)*x3)>>3;
  x1 = x4 + x6;
  x4 -= x6;
  x6 = x5 + x7;
  x5 -= x7;

  /* third stage */
  x7 = x8 + x3;
  x8 -= x3;
  x3 = x0 + x2;
  x0 -= x2;
  x2 = (181*(x4+x5)+128)>>8;
  x4 = (181*(x4-x5)+128)>>8;

  /* fourth stage */
  blk[8*0] = iclp[(x7+x1)>>14];
  blk[8*1] = iclp[(x3+x2)>>14];
  blk[8*2] = iclp[(x0+x4)>>14];
  blk[8*3] = iclp[(x8+x6)>>14];
  blk[8*4] = iclp[(x8-x6)>>14];
  blk[8*5] = iclp[(x0-x4)>>14];
  blk[8*6] = iclp[(x3-x2)>>14];
  blk[8*7] = iclp[(x7-x1)>>14];
}
/* Two-dimensional inverse DCT of one 8x8 block, done in place:
 * a row pass over all eight rows followed by a column pass over all
 * eight columns. block points to 64 shorts stored row by row. */
void Fast_IDCT(block)
short *block;
{
  short *row = block;
  short *col = block;
  int remaining;

  /* horizontal pass: consecutive rows are 8 elements apart */
  for (remaining = 8; remaining > 0; remaining--, row += 8)
    idctrow(row);

  /* vertical pass: consecutive columns are 1 element apart */
  for (remaining = 8; remaining > 0; remaining--, col++)
    idctcol(col);
}
/* Builds the clipping table: points iclp at the middle of iclip and
 * stores, for every index v in -512..511, the value v limited to the
 * range -256..255. */
void Initialize_Fast_IDCT()
{
  int v;

  iclp = iclip + 512;

  for (v = -512; v < 512; v++)
  {
    short limited;

    if (v < -256)
      limited = -256;
    else if (v > 255)
      limited = 255;
    else
      limited = (short)v;

    iclp[v] = limited;
  }
}